Code
DataTransformerRegistry.enable('default')
DataTransformerRegistry.enable('default')
# Load the product table and log its (rows, columns) shape.
df = read_parquet_and_reorder("df.parquet")
logger.info(df.shape)

# Split the table by column-name suffix:
#   * df_per_100g keeps "code" plus every column whose name ends in "_100g";
#   * df keeps every column whose name does not end in "_100g"
#     ("code" therefore stays in both frames).
df_per_100g = df.select(
    ["code", *(name for name in df.columns if name.endswith("_100g"))]
)
df = df.select([name for name in df.columns if not name.endswith("_100g")])
# Names of the columns that are each passed to one_hot_encode below.
columns = [
    "categories_en",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "traces_en",
    "food_groups_en",
    "nutrient_levels_tags",
    "main_category_en",
    "packaging_en",
]

# One frame per entry of `columns`, keyed by the column name, plus the
# per-100g frame under the key "nutrients".
# NOTE(review): the meaning of `n=10` and `remove_prefix` is defined by
# one_hot_encode, which is not visible here -- confirm against that helper.
df_dict: dict[str, pl.DataFrame] = {
    column: df.pipe(one_hot_encode, column, n=10, remove_prefix=["en:", "de:"])
    for column in columns
} | {"nutrients": df_per_100g}

# Captured log output that the export had fused onto the closing line of the
# expression above (which made that line invalid Python):
# [06/30/23 09:12:04] INFO (73307, 175) 2632950586.py:2
[06/30/23 09:12:05] INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO (73307, 11) one_hot_encode.py:58
INFO categories_en 288844683.py:3
INFO ingredients_tags 288844683.py:3
INFO ingredients_analysis_tags 288844683.py:3
INFO traces_en 288844683.py:3
INFO food_groups_en 288844683.py:3
INFO nutrient_levels_tags 288844683.py:3
INFO main_category_en 288844683.py:3
INFO packaging_en 288844683.py:3
INFO nutrients 288844683.py:3
# Fit the normalizer on X, then hold out 20% of the transformed rows as a test
# set; the split is seeded so it is repeatable.
# NOTE(review): X and y are defined outside this excerpt; Normalizer,
# train_test_split and tree are presumably scikit-learn's -- confirm imports.
transformer = Normalizer().fit(X)
X_train, X_test, y_train, y_test = train_test_split(
    transformer.transform(X),
    y,
    test_size=0.20,
    random_state=2023,
)

# Alternative that was left disabled in the original cell:
#   tree.DecisionTreeClassifier(max_depth=15)
clf = tree.DecisionTreeRegressor().fit(X_train, y_train)

# Stack actual vs. predicted scores for the test rows followed by the train
# rows, tag each row with its split, and add err = predicted - actual.
df_tree = pl.concat(
    [
        pl.DataFrame(
            {
                "actual score": targets,
                "predicted score": clf.predict(features),
                "label": split_name,
            }
        )
        for split_name, features, targets in (
            ("test", X_test, y_test),
            ("train", X_train, y_train),
        )
    ]
).with_columns((pl.col("predicted score") - pl.col("actual score")).alias("err"))